#!/usr/bin/env python
#coding:utf-8
#author:dingyangfan
#url:http://www.meiyear.com

import redis
from lib.tag import *
from lib.config import *
from lib.thing import ThreadPool
import multiprocessing,threading
import sys,urlparse
from pyquery import PyQuery as pq
# Apply the global socket timeout to every urllib2/socket operation below.
socket.setdefaulttimeout(settings.timeout)
tt = []  # child crawler processes spawned by ding_do(); killed on the next round
info = {}  # per-process crawl rule/config dict, populated by subexe()
def replacebody(strbody):
    """Apply the active rule's 'rebody' (old, new) replacement pairs.

    strbody -- the HTML/text body to rewrite.

    Returns the body with every configured pair substituted; when the
    rule defines no 'rebody' key the input is returned unchanged.
    """
    # dict.get with a default replaces the deprecated has_key() check
    # (has_key was removed in Python 3; `get` behaves identically here).
    for pair in info.get('rebody', []):
        strbody = strbody.replace(pair[0], pair[1])
    return strbody
def recontent(strcontent):
    """Apply the active rule's 'recontent' (old, new) replacement pairs.

    strcontent -- raw page content to rewrite before parsing.

    Returns the content with every configured pair substituted; when the
    rule defines no 'recontent' key the input is returned unchanged.
    """
    # Mirrors replacebody(); dict.get avoids the deprecated has_key().
    for pair in info.get('recontent', []):
        strcontent = strcontent.replace(pair[0], pair[1])
    return strcontent
def searchindex(url):
    """Fetch a listing page and return the set of thread URLs on it.

    url -- the listing/index page to scan.

    Returns a (possibly empty) set of strings matched by the rule's
    'urlpage' regex.  On any fetch/parse failure the error is logged and
    an empty set is returned -- the original silently returned a list
    here, giving callers an inconsistent return type and hiding errors.
    """
    try:
        req = urllib2.Request(url, headers=info['headers'] if info.has_key('headers') else settings.headers)
        body = urllib2.urlopen(req).read()  # renamed: `tt` shadowed the module global
        # de-duplicate the matched thread links
        return set(re.compile(info['urlpage']).findall(body))
    except Exception as e:
        # best-effort: record the failure instead of swallowing it, and
        # keep the return type consistent with the success path
        logmsg('index失败:%s %s' % (url, e), info['f'])
        return set()
def fetchpage(page,id,conn):
    """Scrape one thread page and persist it as a ti_thread row.

    page -- absolute URL of the thread page.
    id   -- source-site thread identifier; on failure it is removed from
            the redis pending set 'l_<sid>' so it is not retried.
    conn -- open DB connection handed to db_insert_thread() and the
            comment-inserting fetchpagethreads().

    Returns None on failure, False when the page contains one of the
    configured "unknown page" markers; otherwise inserts the thread and
    calls fetchpagethreads() to harvest paging/comments.
    """
    sid = 'l_%s'%info['sid']  # redis set of pending ids for this source
    try:
        content = urllib2.urlopen(page).read().decode(info['pagechatset'],'ignore')
        # polite delay between page fetches (rule override, else global default)
        time.sleep(info['time_s'] if info.has_key('time_s') else settings.time_s)
    except urllib2.URLError:
        logmsg('page失败1:%s' % page,info['f'])
        info['r'].srem(sid,id)  # drop the id so the failed page is not retried
        return None
    except Exception:
        logmsg('page失败2:%s' % page,info['f'])
        info['r'].srem(sid,id)
        return None
    # pages containing any "unknown page" marker are skipped (deleted thread etc.)
    for i in info['unkownpagetag']:
        if content.find(i) != -1:
            return False
    try:
        title = re.compile(info['titletag'],re.DOTALL).search(content).group(1)
    except AttributeError:  # regex did not match -> .search() returned None
        logmsg('title失败:%s' % page,info['f'])
        info['r'].srem(sid,id)
        return None
    # view counter (optional per rule)
    # NOTE(review): a non-matching clicktag raises an uncaught AttributeError
    # here (only UnicodeEncodeError is handled) -- confirm intent
    if info.has_key('clicktag'):
        try:
            clicknum = re.compile(info['clicktag']).search(content).group(1)
        except UnicodeEncodeError:
            clicknum = 0
    else:
        clicknum = 0
    # reply counter (optional per rule)
    if info.has_key('replaytag'):
        try:
            replaynum = re.compile(info['replaytag']).search(content).group(1)
        except UnicodeEncodeError:
            replaynum = 0
        except AttributeError:
            replaynum = 0
    else:
        replaynum = 0
    # author name; the nametag regex is expected to capture at least 2 groups
    try:
        name = re.compile(info['nametag'],re.DOTALL).search(content).group(2)
    except AttributeError:
        name = ' '
    # thread type/category; optionally resolved through an embedded douban iframe
    try:
        if info.has_key('doubanframe'):
            tt__ = pq(content)
            time.sleep(0.3)
            doubancontent = urllib2.urlopen(urlparse.urljoin('http://www.douban.com/',tt__('iframe').attr('src'))).read().decode(info['pagechatset'],'ignore')
            ttype = re.compile(info['ttypetag']).search(doubancontent).group(2)
        else:
            ttype = re.compile(info['ttypetag']).search(content).group(2)
        # map the scraped type string onto the site's numeric type id
        if info['t_type'].has_key(ttype):
            ttype = info['t_type'].get(ttype)
        else:
            ttype = 0
    except AttributeError:
        ttype = 0
    except urllib2.URLError:
        ttype = 0
    except Exception:
        ttype = 0
    # post timestamp: parse a formatted date unless the rule marks it as epoch
    try:
        dateline = re.compile(info['datelinetag']).search(content).group(1)
        # NOTE(review): `has_key(...) is False` relies on has_key returning
        # the bool singleton; 'timet' present means the value is already epoch
        if info.has_key('timet') is False:
            dateline = time.mktime(time.strptime(dateline,info['dateformat'] if info.has_key('dateformat') else settings.dateformat))
        else:
            dateline = int(dateline)
    except AttributeError:
        dateline =  int(time.time())
    # thread body: regex extraction ('bodytag') or CSS selector ('bodytag1')
    # NOTE(review): if neither key exists, content1 is never bound and the
    # recontent() call below raises an uncaught NameError -- confirm rules
    # always define one of the two
    try:
        if info.has_key('bodytag'):
            content1 = re.compile(info['bodytag'],re.DOTALL).search(content).group(1)
        elif info.has_key('bodytag1'):
            jq = pq(content)
            content1 = jq(info['bodytag1']).html()
    except AttributeError:
        logmsg('content失败:%s' % page,info['f'])
        info['r'].srem(sid,id)
        return None
    url = page
    content1 = recontent(content1)
    content1 = deltag(replacebody(content1))
    litpic = getimg(content1)  # first image becomes the thumbnail
    title = deltag(title)
    tid = db_insert_thread(title,name,info['typen'],ttype,dateline,clicknum,replaynum,content1,url,litpic,conn)
    if tid ==0: return None  # insert failed or duplicate
    fetchpagethreads(id,name,tid,ttype,content,conn)
def fetchpagethreads(id,name,tid,ttype,content,conn,t=False,p=None):
    """Walk a thread's extra pages and store its comments as ti_threads rows.

    id      -- source thread identifier (template key for page URLs).
    name    -- thread author, used by the optional 'onlyname' filter.
    tid     -- local ti_thread row id the comments belong to.
    ttype   -- resolved numeric thread type.
    content -- HTML of the page currently being processed.
    conn    -- open DB connection.
    t       -- False on the entry call: then the pagination loop runs and
               recurses with t=True for every extra page.
    p       -- URL the current `content` came from (None on first call).
    """
    # --- pagination: only on the entry call, and only when the
    # "has more pages" marker is present in the page ---
    if t is False and content.find(info['endpagetag']) != -1:
        try:
            # NOTE(review): pagecounttag is eval()'d configuration -- only
            # acceptable while rules are trusted local files
            pagecount = int(eval(info['pagecounttag']))
        except Exception:
            pagecount = 0
        step =info['step'] if info.has_key('step') else settings.step
        startvpage = info['startvpage'] if info.has_key('startvpage') else settings.startvpage
        pagecount = int(pagecount+step)
        au = 0  # number of extra pages actually processed
        for i in range(int(startvpage),pagecount,int(step)):
            # build the page URL, optionally rewriting parts of the id first
            if(info.has_key('pagepid')):
                for u in info['pagepid']:
                    id = id.replace(u[0],u[1])
                p = info['vpage'] % (id % i)
            else:
                p = info['vpage'] % (id,i)
            logmsg('%s(%s/%s)'%(p,i,pagecount-step),info['f'])
            try:
                time.sleep(info['time_s'] if info.has_key('time_s') else settings.time_s)
                content = urllib2.urlopen(p).read().decode(info['pagechatset'],'ignore')
            except urllib2.URLError:
                # retry through the fallback fetcher
                time.sleep(info['time_s'] if info.has_key('time_s') else settings.time_s)
                content = vpageurl(p,info['f'],info['pagechatset'])
            except Exception:
                time.sleep(info['time_s'] if info.has_key('time_s') else settings.time_s)
                content = vpageurl(p,info['f'],info['pagechatset'])
            if content == -1:  # vpageurl() signals permanent failure with -1
                continue
            # recurse with t=True so the extra page only yields comments
            fetchpagethreads(id,name,tid,ttype,content,conn,True,p)
            au +=1
        c = conn.cursor()
        # NOTE(review): values interpolated into SQL; au/tid are ints here,
        # but a parameterized query would be safer
        c.execute("""update ti_thread set `u`=%s  where id=%s"""%(au,tid))
        c.close()
    # --- comment extraction for the current page ---
    if(info.has_key('commenttag')):
        # commenttag = [container xpath, {'name':.., 'time':.., 'body':..}]
        try:
            r = HTML.document_fromstring(recontent(content))
        except ValueError:
            logmsg('vpage content失败：%s'%p,info['f'])
            return None
        node = r.xpath(info['commenttag'][0])
        names = []
        times = []
        commentbody = []
        for i in node:
            for j in info['commenttag'][1]:
                nntag = i.find(info['commenttag'][1][j])
                if nntag is None:
                    # keep the three parallel lists aligned with placeholders
                    if(j == 'name'):
                        names.append('')
                    elif (j =='time'):
                        times.append('')
                    elif (j =='body'):
                        commentbody.append(' ')
                    continue
                if (j == 'body'):
                    # body keeps child markup: element text + serialized children
                    try:
                        data = '%s'%nntag.text + ''.join([HTML.tostring(y,encoding=info['pagechatset']).strip() for y in nntag.getchildren()]).decode(info['pagechatset'])
                    except TypeError:
                        data = nntag.text
                    except UnicodeDecodeError:
                        data = nntag.text
                    except AttributeError:
                        data = ' '
                else:
                    data = nntag.text
                if(j == 'name'):
                    data = data if len(data) < 30 else data[0:30]  # column width limit
                    names.append(data)
                elif (j =='time'):
                    times.append(str(data).strip())
                elif (j =='body'):
                    commentbody.append(data)
    else:
        # alternative extraction path: rule-supplied eval() expressions
        names = []
        if(info.has_key('commentnametag')):
            names = eval(info['commentnametag'])
        times = eval(info['commenttimestag'])
        commentbody = eval(info['commentbody'])
    j = 0  # index into the parallel names/times lists
    c = conn.cursor()

    for i in commentbody:
        if i is None: continue
        # comment timestamp: parse unless the rule marks values as epoch ('timet')
        if len(times) > 0:
            try:
                if info.has_key('timet') is False:
                    dateline = time.mktime(time.strptime(deltag(times[j]),info['dateformat'] if info.has_key('dateformat') else settings.dateformat))
                else:
                    # NOTE(review): reuses `dateline` from a previous iteration
                    # (or raises NameError on the first); times[j] looks like
                    # the intended source -- confirm
                    dateline = int(dateline)
            except ValueError:
                dateline =  int(time.time())
        else:
            dateline =  int(time.time())
        if info.has_key('commentnametag') or len(names)>0:
            try:
                sname = names[j]
            except ValueError:
                # NOTE(review): an out-of-range j raises IndexError, which is
                # not caught here -- confirm whether IndexError was meant
                sname = 'admin'
        else:
            sname = ''
        j+=1

        try:
            if info.has_key('onlyname'):
                # keep only comments authored by the thread author
                if name != sname:
                    continue
            # parameterized insert -- safe against quoting issues
            c.execute("insert into ti_threads (`aid`,`dateline`,`body`,`tid`,`tid2`,`name`)values(%s,%s,%s,%s,%s,%s)",
                (tid,dateline,dcode(deltag(replacebody(i))),info['typen'],ttype,dcode(sname))
            )
        except Exception as e:
            print 'threads error %s'%e
            pass
    # NOTE(review): p (a crawled URL) is interpolated straight into SQL --
    # injection/quoting risk; parameterize like the insert above if possible
    c.execute("""update ti_thread set `purl`='%s' ,`orde`=%s where id=%s"""%(p,len(commentbody),tid))
    conn.commit()
    c.close()
def nodestart():
    """Run one full crawl pass for the rule currently loaded in `info`.

    Connects to redis and loads the thread-type map, harvests the rule's
    listing pages through a 3-worker thread pool, then drains the
    collected page set with done_page().  Progress and errors are
    appended to the rule's log file (info['f']).
    """
    global info
    r = redis.Redis(host=settings.redisinfo[0], port=settings.redisinfo[1], db=settings.redisinfo[2])
    t_type = getttype()
    info['r'] = r
    info['t_type'] = t_type
    try:
        # timestamps are shifted +8h to log in China Standard Time
        logmsg('开始时间：%s'%time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime(int(time.time()) + 8*60*60)),info['f'])
        tp = ThreadPool(3)
        for i in info['urls']:
            tp.add_job(done_start,*(i,searchindex,fetchlist,info))
            time.sleep(2)  # stagger listing fetches to be polite
        tp.wait_for_complete()
        logmsg('%s页面开采集开始'%info['f'],info['f'])
        thread_count =info['thread_count'] if info.has_key('thread_count') else settings.thread_count
        done_page(info['page'],fetchpage,info['f'],thread_count)
    except Exception as e:
        logmsg('错误%s'%e,info['f'])
    time.sleep(5)
    # NOTE(review): `q` is not defined in this file -- presumably a queue
    # exported by one of the star imports; confirm before refactoring
    q.join()
    logmsg('完成时间：%s'%time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime(int(time.time()) + 8*60*60)),info['f'])
    logmsg('\n\n\n',info['f'])
    print u''

def subexe(i):
    """Worker entry point for one crawl rule: crawl, sleep, repeat.

    i -- a ding_rule dict from settings; its entries are copied into the
         module-global `info` so the helper functions can read them.

    The original re-entered itself via multiprocessing.Process(...).run(),
    which executes the target in the *current* process -- i.e. plain
    recursion that grows the stack on every cycle.  A loop gives the same
    periodic re-crawl without unbounded recursion.
    """
    global info
    while True:
        info = {}
        reload(settings)  # pick up settings edits between runs
        for j in i:
            info[j] = i[j]
        nodestart()
        # per-rule rest period, falling back to the global default
        time.sleep(info['run_time'] if 'run_time' in info else settings.run_time)
def ding_do():
    """(Re)spawn one crawler process per configured rule.

    Terminates any still-running processes from the previous round, then
    starts new ones according to the command line:
      -s <sid> : only the rule whose 'sid' matches sys.argv[2]
      -m       : one process per rule
      (none)   : one process per rule
    Finally reschedules itself to run again in 24 hours.
    """
    global tt
    # kill leftovers from the previous 24h round
    # NOTE(review): tt is never cleared, so dead Process objects accumulate
    # across rounds -- harmless for terminate(), but worth confirming
    if len(tt)>0:
        for i in tt:
            print u'杀死进程pid',i.pid,u' 活动状态：',i.is_alive()
            if i.is_alive():
                i.terminate()
    if len(sys.argv) > 1:
        if sys.argv[1] =='-s':
            # run only the rule with the requested sid
            for i in settings.ding_rule:
                if int(sys.argv[2]) == i['sid']:
                    subt = multiprocessing.Process(target=subexe,args=(i,))
                    tt.append(subt)
                    subt.start()
                    print u'进程开始运行：',i['f'],u' 起始pid为：',subt.pid
        elif sys.argv[1] =='-m':
            for i in settings.ding_rule:
                subt = multiprocessing.Process(target=subexe,args=(i,))
                tt.append(subt)
                subt.start()
                print u'进程开始运行：',i['f'],u' 起始pid为：',subt.pid
    else:
        # default: run every configured rule
        for i in settings.ding_rule:
            subt = multiprocessing.Process(target=subexe,args=(i,))
            tt.append(subt)
            subt.start()
            print u'进程开始运行：',i['f'],u' 起始pid为：',subt.pid
    # daily restart of all crawler processes
    threading.Timer(24*3600,ding_do).start()
def check_ding():
    print execfile('check.py')
    threading.Timer(3600*24*7,check_ding).start()

if __name__=='__main__':
    # Command line:
    #   run.py -s <sid>  : run only the ding_rule whose sid matches
    #   run.py -m sohu   : swap in the sohu settings module, then run all rules
    #   run.py -h        : print usage and exit
    if len(sys.argv) > 1:
        if sys.argv[1] == '-m' and sys.argv[2] == 'sohu':
            # NOTE(review): rebinds the module-global `settings` before
            # ding_do() forks the worker processes -- confirm workers see it
            import settings_sohu as settings
        elif sys.argv[1] == '-h':
            print '--------help information----------'
            print '      -s : this is ding_rule sid example: run.py -s 152'
            print '      -m : this is module ding_rule example: runpy -m sohu'
            exit()
    ding_do()
    #check_ding()
    pass